package com.yzq.os.spider.v.service.spider;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.log4j.Logger;
import org.springframework.util.StopWatch;

import com.yzq.os.spider.v.domain.Record;
import com.yzq.os.spider.v.domain.ListPageConfig;
import com.yzq.os.spider.v.domain.QueryURL;
import com.yzq.os.spider.v.domain.SearchEngine;
import com.yzq.os.spider.v.service.CrawlService;
import com.yzq.os.spider.v.service.domain.SpiderRecordService;
import com.yzq.os.spider.v.service.domain.QueryURLService;
import com.yzq.os.spider.v.service.domain.SearchEngineParamService;
import com.yzq.os.spider.v.service.domain.SearchEngineService;
import com.yzq.os.spider.v.service.http.HttpClientService;
import com.yzq.os.spider.v.service.queryurl.CreateQueryURL;
import com.yzq.os.spider.v.util.Encode;
import com.yzq.os.spider.v.util.EncodeUtil;
import com.yzq.os.spider.v.util.MD5;
import com.yzq.os.spider.v.util.Page;
import com.yzq.os.spider.v.util.Regex;

/**
 * Abstract crawl task: shared workflow for fetching a search-result list
 * page, paging through it, and extracting job records.
 * 
 * @author 苑志强(xingyu_yzq@163.com)
 * 
 */
public abstract class AbstractCrawlTask implements SpiderTask {

	protected static final int FIRST_PAGE_NO = 1;

	protected static final int SECOND_PAGE_NO = 2;

	protected Logger logger = Logger.getLogger(getClass());

	// Collaborating services, injected through the SpiderTask setters below.
	protected SearchEngineService searchEngineService;

	protected SearchEngineParamService searchEngineParamService;

	protected QueryURLService queryURLService;

	protected SpiderRecordService crawJobService;

	// Per-task crawl context.
	protected SearchEngine searchEngine;

	protected ListPageConfig listPageConfig;

	protected QueryURL queryURL;

	protected HttpClientService httpClientService;

	// Character encoding used when parsing/encoding URL parameters;
	// derived from the engine in setSearchEngine().
	protected String urlEncode;

	// Full HTML of the fetched list page.
	protected String pageHtmlSource;

	// HTML fragment of the page that contains the record list.
	protected String jobsPartHtmlSource;

	// Records extracted from the current page.
	protected List<Record> jobLists;

	protected Date crawlDate;

	protected String tableNameForSaveJobs;

	// Extra HTTP headers registered via addHttpHeader(), sent with every request.
	private List<Header> resetHeader = new ArrayList<Header>();

	// When true, iteration-style next-page generation is suppressed
	// (calculated paging only). See processResultPage().
	private boolean forceUseCalculateTurnPage = false;

	// --- Dependency/context injection (SpiderTask contract; called before run()) ---

	@Override
	public void setSearchEngineService(SearchEngineService searchEngineService) {
		this.searchEngineService = searchEngineService;
	}

	@Override
	public void setSearchEngineParamService(
			SearchEngineParamService searchEngineParamService) {
		this.searchEngineParamService = searchEngineParamService;
	}

	@Override
	public void setQueryURLService(QueryURLService queryURLService) {
		this.queryURLService = queryURLService;
	}

	@Override
	public void setSpiderRecordService(SpiderRecordService crawJobService) {
		this.crawJobService = crawJobService;
	}

	// Also derives the URL-parameter encoding from the engine configuration.
	@Override
	public void setSearchEngine(SearchEngine searchEngine) {
		this.searchEngine = searchEngine;
		urlEncode = SearchEngineService.getEncode(searchEngine.getUrlEncode());
	}

	@Override
	public void setCrawlDate(Date crawlDate) {
		this.crawlDate = crawlDate;
	}

	// Target table for persisting extracted records.
	@Override
	public void setTableName(String tableName) {
		this.tableNameForSaveJobs = tableName;
	}

	@Override
	public void setQueryURL(QueryURL queryURL) {
		this.queryURL = queryURL;
	}

	@Override
	public void setListPageConfig(ListPageConfig listPageConfig) {
		this.listPageConfig = listPageConfig;
	}

	@Override
	public void setHttpClientService(HttpClientService httpClientService) {
		this.httpClientService = httpClientService;
	}

	/**
	 * Pre-crawl hook, invoked at the start of {@link #run()}. No-op by
	 * default; subclasses override to prepare engine-specific state.
	 */
	@Override
	public void initializationBeforeRun() {
	}

	/**
	 * Main crawl workflow: optional throttle sleep, URL placeholder
	 * replacement, page fetch, result processing, and doFlag bookkeeping.
	 * All failures are mapped to a QueryURL error flag; the flag is always
	 * persisted in the finally block.
	 */
	@Override
	public void run() {
		initializationBeforeRun();
		queryURL.setDoFlag(QueryURL.OK);
		Integer eId = searchEngine.getId();
		String eName = searchEngine.getName();
		int sleepTime = searchEngine.getSleepTime();
		int qId = queryURL.getId();
		try {
			if (sleepTime != 0) {
				// Serializes the throttle sleep across all tasks of the same
				// concrete class, spacing out requests to the engine.
				synchronized (this.getClass()) {
					logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
							+ "] will sleep time(milliseconds):[" + sleepTime
							+ "] ."));
					TimeUnit.MILLISECONDS.sleep(sleepTime);
				}
			}
			// Fill the configured parameter placeholders in both URLs.
			queryURL.setSpellUrl(searchEngineParamService.replaceHolder(
					queryURL.getSpellUrl(), urlEncode));
			queryURL.setPostUrl(searchEngineParamService.replaceHolder(
					queryURL.getPostUrl(), urlEncode));
			reWriteQueryUrl();
			pageHtmlSource = crawlPostUrlHtmlSource();// fetch the list-page HTML
			if (StringUtils.isNotBlank(pageHtmlSource)) {
				if (!isNoSearchResultsPage()) {
					processResultPage(eName, qId);
				} else {
					queryURL.setDoFlag(QueryURL.ERROR_NO_FOUNDED_JOB_PAGE);
					logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
							+ "] not founded job page!  html length:["
							+ StringUtils.length(pageHtmlSource) + "]."));
				}
			} else {
				queryURL.setDoFlag(QueryURL.ERROR_HTML_PAGE_SOURCE_BLANK);
				logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
						+ "] html source is blank! "));
			}
		} catch (ClientProtocolException e) {
			queryURL.setDoFlag(QueryURL.ERROR_NETWORK_ACCESS);
			logger.error(
					EncodeUtil.gbk2iso("ClientProtocolException!  [" + eName
							+ "]:[" + qId + "]."), e);
		} catch (IOException e) {
			queryURL.setDoFlag(QueryURL.ERROR_NETWORK_ACCESS);
			logger.error(
					EncodeUtil.gbk2iso("IOException!  [" + eName + "]:[" + qId
							+ "]."), e);
		} catch (Exception e) {
			// Also covers InterruptedException from the throttle sleep.
			queryURL.setDoFlag(QueryURL.ERROR_OTHER);
			logger.error(
					EncodeUtil.gbk2iso("Exception!  [" + eName + "]:[" + qId
							+ "]. html:[" + EncodeUtil.gbk2iso(pageHtmlSource)
							+ "]"), e);
		} finally {
			// Count the attempt and persist the outcome flag, success or not.
			CrawlService.addQueryCount(eId, 1);
			queryURLService.updateDoFlag(eId, queryURL);
		}
		logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
				+ "] crawl run method finish."));
	}

	/**
	 * Extension hook for decorating/rewriting the query URL before the HTTP
	 * request is made. No-op by default; subclasses override as needed.
	 */
	@Override
	public void reWriteQueryUrl() {
	}

	/**
	 * Fetches the raw HTML of the current post URL, dispatching on the
	 * engine's configured HTTP method and logging the access time.
	 * 
	 * @return the fetched page HTML
	 * @throws Exception on any network or protocol failure
	 */
	@Override
	public String crawlPostUrlHtmlSource() throws Exception {
		StopWatch watch = new StopWatch();
		watch.start();
		String html = (searchEngine.getMethod() == SearchEngine.METHOD_POST)
				? doPostCrawlHtmlSource()
				: doGetCrawlHtmlSource();
		watch.stop();
		logger.info("[" + EncodeUtil.gbk2iso(searchEngine.getName()) + "]:["
				+ queryURL.getId() + "] http accessed !. cost:["
				+ watch.getTotalTimeMillis() + "] ");
		return html;
	}

	/**
	 * Fetches the list-page HTML with an HTTP GET request.
	 * 
	 * @return the fetched page HTML
	 * @throws Exception on any network failure
	 */
	protected String doGetCrawlHtmlSource() throws Exception {
		// getIsGzip() == 1 means the engine serves gzip-compressed responses;
		// the redundant "? true : false" ternary was dropped.
		boolean isGzip = searchEngine.getIsGzip() == 1;
		return httpClientService.doGetRequest(queryURL.getPostUrl(), isGzip,
				resetHeader);
	}

	/**
	 * Fetches the list-page HTML with an HTTP POST request: the query string
	 * of the post URL is parsed and decoded into form parameters, and the
	 * request is posted to the URL's path part.
	 * 
	 * @return the fetched page HTML
	 * @throws Exception on any network failure or malformed URL
	 */
	protected String doPostCrawlHtmlSource() throws Exception {
		String url = queryURL.getPostUrl();
		URI uri = new URI(url);
		List<NameValuePair> parameters = URLEncodedUtils.parse(uri, urlEncode);
		// Parameters arrive URL-encoded; decode them so the HTTP layer can
		// re-encode them consistently with the engine's charset.
		parameters = decodePostParams(parameters, urlEncode);
		url = StringUtils.substringBefore(uri.toString(), "?");
		String encode = SearchEngineService.getEncode(searchEngine.getEncode());
		// Redundant "? true : false" ternary dropped.
		boolean isGzip = searchEngine.getIsGzip() == 1;
		return httpClientService.doPostRequest(url, Encode.getEncode(encode),
				isGzip, resetHeader, parameters, urlEncode);
	}

	/**
	 * URL-decodes the value of every name/value pair with the given charset.
	 * 
	 * @param parameters pairs to decode (may be null or empty)
	 * @param enc        charset name used for decoding
	 * @return a new list with decoded values, or the input unchanged when it
	 *         is null or empty
	 * @throws UnsupportedEncodingException when {@code enc} is not supported
	 */
	protected List<NameValuePair> decodePostParams(
			List<NameValuePair> parameters, String enc)
			throws UnsupportedEncodingException {
		if (CollectionUtils.isNotEmpty(parameters)) {
			List<NameValuePair> returnPairs = new ArrayList<NameValuePair>();
			for (NameValuePair pair : parameters) {
				// Null values are decoded as the empty string.
				String rawValue = pair.getValue() != null ? pair.getValue() : "";
				returnPairs.add(new BasicNameValuePair(pair.getName(),
						URLDecoder.decode(rawValue, enc)));
			}
			// BUG FIX: the decoded list was built but the original (still
			// encoded) list was returned, discarding all decoding work.
			return returnPairs;
		}
		return parameters;
	}

	/**
	 * URL-encodes the value of every name/value pair with the given charset.
	 * 
	 * @param parameters pairs to encode (may be null or empty)
	 * @param enc        charset name used for encoding
	 * @return a new list with encoded values, or the input unchanged when it
	 *         is null or empty
	 * @throws UnsupportedEncodingException when {@code enc} is not supported
	 */
	protected List<NameValuePair> encodePostParams(
			List<NameValuePair> parameters, String enc)
			throws UnsupportedEncodingException {
		if (CollectionUtils.isNotEmpty(parameters)) {
			List<NameValuePair> returnPairs = new ArrayList<NameValuePair>();
			for (NameValuePair pair : parameters) {
				// Null values are encoded as the empty string.
				String rawValue = pair.getValue() != null ? pair.getValue() : "";
				returnPairs.add(new BasicNameValuePair(pair.getName(),
						URLEncoder.encode(rawValue, enc)));
			}
			// BUG FIX: mirror of decodePostParams — the encoded list was
			// built but never returned.
			return returnPairs;
		}
		return parameters;
	}

	// Allows a caller (or test) to inject the fetched page HTML directly.
	@Override
	public void setPostUrlHtmlSource(String postUrlHtmlSource) {
		this.pageHtmlSource = postUrlHtmlSource;
	}

	/**
	 * Decides whether the fetched page is a "no search results" page by
	 * matching the configured no-data regex against the page source.
	 * 
	 * @return true when the no-data pattern matches the current page HTML
	 */
	@Override
	public boolean isNoSearchResultsPage() {
		String noDataPageRegex = listPageConfig.getNoDataPageRegex();
		boolean noResult = StringUtils.isNotBlank(noDataPageRegex)
				&& StringUtils.isNotBlank(pageHtmlSource)
				&& Regex.isContains(noDataPageRegex, pageHtmlSource);
		logger.info("[" + (queryURL != null ? queryURL.getId() : "")
				+ "],isNoSearchResultsPage[" + noResult + "]");
		return noResult;
	}

	/**
	 * Registers an extra HTTP header to be sent with every request of this
	 * task.
	 * 
	 * @param name  header name
	 * @param value header value
	 */
	public void addHttpHeader(String name, String value) {
		resetHeader.add(new BasicHeader(name, value));
	}

	/**
	 * Processes a successfully fetched result page: schedules follow-up list
	 * pages (calculated, qualified or iterative paging) and extracts/saves
	 * the records found on this page.
	 * 
	 * @param eName search engine name (logging only)
	 * @param qId   query URL id (logging only)
	 * @throws UnsupportedEncodingException propagated from qualified-URL
	 *                                      generation
	 */
	private void processResultPage(String eName, int qId)
			throws UnsupportedEncodingException {
		jobsPartHtmlSource = sectionContainsJobsHtmlSource();
		Integer engineId = searchEngine.getId();
		boolean firstPage = isFirstListPage();
		Boolean hasNextPage = hasNextPage();
		Boolean tooLargeReturn = largeThanMaxReturn();
		Integer returnRecordNum = extractReturnRecordNum();

		logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId + "] first:["
				+ firstPage + "],next:[" + hasNextPage + "],large:["
				+ tooLargeReturn + "],record:[" + returnRecordNum + "]"));

		// null means "unknown"; unknowns are treated optimistically below.
		boolean notTooLarge = tooLargeReturn == null || !tooLargeReturn;
		boolean tooLarge = tooLargeReturn != null && tooLargeReturn;
		boolean mayHaveNextPage = hasNextPage == null || hasNextPage;

		// First page with a known, acceptable record count: compute all
		// remaining page URLs up front.
		if (firstPage && notTooLarge && returnRecordNum != null
				&& mayHaveNextPage) {
			saveCalculatedTurnPageUrls(eName, qId, engineId, returnRecordNum);
		}

		// First page but too many results: narrow the query instead of paging.
		if (firstPage && tooLarge && mayHaveNextPage) {
			saveQualifiedUrls(eName, qId, engineId);
			return;
		}

		// A non-first page of an over-large result set is not worth mining.
		if (!firstPage && tooLarge) {
			logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
					+ "] not first page and too large record. direct return."));
			return;
		}

		// Iterative paging: derive the next page URL from the current one.
		if (notTooLarge && mayHaveNextPage && !forceUseCalculateTurnPage) {
			saveIterationNextPageUrl(eName, qId, engineId);
		}

		extractAndSaveJobs(eName, qId, engineId);
	}

	/** Computes pages 2..N from the total record count and saves their URLs. */
	private void saveCalculatedTurnPageUrls(String eName, int qId,
			Integer engineId, int returnRecordNum) {
		int pageSize = listPageConfig.getPageSize();
		int totalPageNum = Page.getAnyTotalPageCount(returnRecordNum, pageSize);
		List<String> nextSpellUrls = makeSecondStartNextSpellUrl(totalPageNum);
		List<QueryURL> queryUrls = new ArrayList<QueryURL>();
		if (CollectionUtils.isNotEmpty(nextSpellUrls)) {
			for (String nextSpellUrl : nextSpellUrls) {
				queryUrls.add(createQueryURL(uniformURLFormat(nextSpellUrl)));
			}
			queryURLService.batchSave(engineId, queryUrls, 1000);
		}
		logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
				+ "] is first page and [calculate] . generate url size:["
				+ CollectionUtils.size(queryUrls) + "]."));
	}

	/** Narrows an over-large result set by saving qualified (extra-parameter) URLs. */
	private void saveQualifiedUrls(String eName, int qId, Integer engineId)
			throws UnsupportedEncodingException {
		List<QueryURL> qualifiedURLs = generateQualifiedURLs(queryURL);
		if (CollectionUtils.isNotEmpty(qualifiedURLs)) {
			queryURLService.batchSave(engineId, qualifiedURLs, 1000);
		}
		logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
				+ "] qualified query url! qualified url size:["
				+ CollectionUtils.size(qualifiedURLs) + "]."));
	}

	/** Builds the next-page URL from the current one and saves it (iterative paging). */
	private void saveIterationNextPageUrl(String eName, int qId,
			Integer engineId) {
		String nextSpellUrl = makeNextSpellUrl();
		if (StringUtils.isNotBlank(nextSpellUrl)) {
			queryURLService.save(engineId,
					createQueryURL(uniformURLFormat(nextSpellUrl)));
		}
		logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
				+ "] not first page and [iteration]. nextSpellUrl:["
				+ StringUtils.abbreviate(nextSpellUrl, 4000) + "]."));
	}

	/** Extracts the records from the list fragment and persists them. */
	private void extractAndSaveJobs(String eName, int qId, Integer engineId) {
		if (StringUtils.isNotBlank(jobsPartHtmlSource)) {
			jobLists = extractJobs();
			if (CollectionUtils.isNotEmpty(jobLists)) {
				setUniqueMd5(jobLists);
				setWebsiteId(jobLists);
				batchSaveJobInfos(jobLists);
				int jobsCount = CollectionUtils.size(jobLists);
				CrawlService.addJobCount(engineId, jobsCount);
				logger.info(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
						+ "] extract jobs. currentPageNo:["
						+ extractCurrentPageNo() + "],jobs size:[" + jobsCount
						+ "]"));
			} else {
				queryURL.setDoFlag(QueryURL.ERROR_RECORD_LIST_IS_EMPTY);
				logger.error(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
						+ "] jobsList is empty! constainsJobsHtmlSource:["
						+ jobsPartHtmlSource + "]"));
			}
		} else {
			queryURL.setDoFlag(QueryURL.ERROR_RECORD_LIST_HTML_SOURCE_BLANK);
			logger.error(EncodeUtil.gbk2iso("[" + eName + "]:[" + qId
					+ "] jobs html blank!"));
		}
	}

	/**
	 * Determines whether the current result page is the first list page.
	 * 
	 * @return {@code true} only when the extracted current page number equals
	 *         {@link #FIRST_PAGE_NO}; {@code false} when it differs or cannot
	 *         be extracted
	 */
	@Override
	public boolean isFirstListPage() {
		Integer currentPageNo = extractCurrentPageNo();
		// Compare numerically against the existing constant instead of the
		// original String.valueOf(...) round-trip against the literal "1".
		return currentPageNo != null
				&& currentPageNo.intValue() == FIRST_PAGE_NO;
	}

	/**
	 * Checks whether the result count reported by the page reaches the
	 * configured maximum the engine will actually display.
	 * 
	 * @return {@code true} when the count reaches the cap, {@code false}
	 *         otherwise, and {@code null} when no count could be extracted
	 */
	@Override
	public Boolean largeThanMaxReturn() {
		Boolean verdict;
		int maxRecordNum = listPageConfig.getMaxRecordNum();
		Integer returnRecordNum = extractReturnRecordNum();
		if (returnRecordNum == null) {
			// Unknown record count: the caller treats null as "unknown".
			verdict = null;
		} else {
			// A cap of 0 or less means "no cap configured".
			verdict = maxRecordNum > 0 && maxRecordNum <= returnRecordNum;
		}
		logger.debug("largeThanMaxReturn is:[" + verdict + "]");
		return verdict;
	}

	/**
	 * Generates narrowed query URLs (with an extra restricting parameter) to
	 * shrink an over-large result set, delegating to the engine-specific
	 * {@link CreateQueryURL} implementation.
	 * 
	 * @param queryURL the query whose result set is too large
	 * @return the narrowed URLs, or null when no factory is configured
	 * @throws UnsupportedEncodingException propagated from the factory
	 */
	@Override
	public List<QueryURL> generateQualifiedURLs(QueryURL queryURL)
			throws UnsupportedEncodingException {
		CreateQueryURL factory = buildCreateQueryURLObject();
		if (factory == null) {
			logger.error("CreateQueryURL object is null. ");
			return null;
		}
		return factory.generateQualifiedURLs(queryURL);
	}

	/**
	 * Instantiates the engine-specific {@link CreateQueryURL} implementation,
	 * if one is configured on the search engine.
	 * 
	 * @return the factory instance, or {@code null} when none is configured
	 */
	protected CreateQueryURL buildCreateQueryURLObject() {
		CreateQueryURL createQueryURL = null;
		// Local renamed: the original "CreateQueryURLClass" violated Java's
		// lowerCamelCase convention for local variables.
		String createQueryURLClass = searchEngine.getCreateQueryURLClass();
		if (StringUtils.isNotBlank(createQueryURLClass)) {
			createQueryURL = searchEngineService
					.instantiateQueryURLClass(createQueryURLClass);
		}
		return createQueryURL;
	}

	/**
	 * Narrows the full page HTML down to the region containing the record
	 * list, using the configured data-region regex; falls back to the whole
	 * page when no regex is configured.
	 * 
	 * @return the HTML fragment containing the record list
	 */
	@Override
	public String sectionContainsJobsHtmlSource() {
		// The original new String(pageHtmlSource) copy was pointless (String
		// is immutable) and would NPE on a null source.
		String containsJobHtml = pageHtmlSource;
		String dataRegionRegex = listPageConfig.getDataRegionRegex();
		if (StringUtils.isNotBlank(dataRegionRegex)) {
			containsJobHtml = Regex.matchSRowSField(pageHtmlSource,
					dataRegionRegex, false);
		}
		return containsJobHtml;
	}

	// Allows a caller (or test) to inject the record-list HTML fragment directly.
	@Override
	public void setContainsJobsHtmlSource(String jobInfoHtmlSource) {
		this.jobsPartHtmlSource = jobInfoHtmlSource;
	}

	/**
	 * Default list-record extraction via the configured regular expressions:
	 * each field regex is applied to the record-list HTML fragment, producing
	 * six parallel value lists that are zipped into Record objects.
	 * 
	 * NOTE(review): assumes all six regexes yield lists of equal length; if a
	 * field regex matches fewer rows, the get(i) calls below throw
	 * IndexOutOfBoundsException (caught by run() as ERROR_OTHER) — confirm
	 * the per-engine configs keep the lists aligned.
	 * 
	 * @return the records extracted from the current page fragment
	 */
	@Override
	public List<Record> extractJobs() {
		List<String> jobTitles = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getJobTitleRegex(), false);
		List<String> jobHrefs = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getJobHrefRegex(), false);
		List<String> companyNames = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getCompanyNameRegex(), false);
		List<String> companyHrefs = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getCompanyHrefRegex(), false);
		List<String> jobCitys = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getJobCityRegex(), false);
		List<String> jobDates = Regex.matchMRowSField(jobsPartHtmlSource,
				listPageConfig.getJobDateRegex(), false);

		List<Record> jobInfos = new ArrayList<Record>();
		for (int i = 0; i < jobTitles.size(); i++) {
			// Get value
			String jobTitle = jobTitles.get(i);
			String jobHref = jobHrefs.get(i);
			String companyName = companyNames.get(i);
			String companyHref = companyHrefs.get(i);
			String jobCity = jobCitys.get(i);
			String jobDate = jobDates.get(i);
			// New object and set value
			Record jobInfo = new Record();
			jobInfo.setSearchEngineId(queryURL.getSearchEngineId());
			jobInfo.setQueryUrlId(queryURL.getId());
			jobInfo.setJobTitle(jobTitle);
			jobInfo.setCompanyName(companyName);
			jobInfo.setCityText(jobCity);
			jobInfo.setJobDate(parseJobDate(jobDate));
			jobInfo.setJobTypeCode(extractJobTypeCode());
			jobInfo.setIndustryCode(extractIndustryCode());
			jobInfo.setCityCode(extractCityCode());
			jobInfo.setJobLinkURL(uniformURLFormat(jobHref));
			jobInfo.setCompanyLinkURL(uniformURLFormat(companyHref));
			jobInfos.add(jobInfo);
		}
		return jobInfos;
	}

	/**
	 * Extracts the job-type code from the configured search-URL parameter.
	 * 
	 * @return the parameter value, or "" when absent
	 */
	@Override
	public String extractJobTypeCode() {
		return getParameterValue(listPageConfig.getJobTypeName());
	}

	/**
	 * Extracts the industry code from the configured search-URL parameter.
	 * 
	 * @return the parameter value, or "" when absent
	 */
	@Override
	public String extractIndustryCode() {
		return getParameterValue(listPageConfig.getIndustryName());
	}

	/**
	 * Extracts the city code from the configured search-URL parameter.
	 * 
	 * @return the parameter value, or "" when absent
	 */
	@Override
	public String extractCityCode() {
		return getParameterValue(listPageConfig.getCityName());
	}

	/**
	 * Extracts the total record count reported by the page (used for
	 * calculated paging).
	 * 
	 * @return the count, or null when no regex is configured or nothing
	 *         matched
	 */
	@Override
	public Integer extractReturnRecordNum() {
		String recordNumRegex = listPageConfig.getReturnRecordNumRegex();
		if (StringUtils.isBlank(recordNumRegex)) {
			return null;
		}
		String recordNumText = StringUtils.trim(Regex.matchSRowSField(
				pageHtmlSource, recordNumRegex, false));
		if (StringUtils.isBlank(recordNumText)) {
			return null;
		}
		return Integer.valueOf(recordNumText);
	}

	/**
	 * Extracts the current page number from the configured URL parameter.
	 * 
	 * @return the page number, or null when no parameter name is configured
	 *         or the parameter is absent
	 */
	@Override
	public Integer extractCurrentPageNo() {
		String pageNoName = listPageConfig.getCurrentPageNoName();
		if (StringUtils.isBlank(pageNoName)) {
			return null;
		}
		String pageNoValue = getParameterValue(pageNoName);
		if (StringUtils.isBlank(pageNoValue)) {
			return null;
		}
		return Integer.valueOf(pageNoValue);
	}

	/**
	 * Persists the extracted records in a batch into the configured table,
	 * stamped with the crawl date.
	 * 
	 * @param crawlJobs records to save
	 */
	@Override
	public void batchSaveJobInfos(List<Record> crawlJobs) {
		crawJobService.batchSave(crawlJobs, crawlDate, tableNameForSaveJobs);
	}

	/**
	 * Determines whether the list has a further page, by comparing the
	 * current page number with the total page count derived from the reported
	 * record count.
	 * 
	 * @return {@code true}/{@code false} when determinable, {@code null} when
	 *         either the record count or the page number cannot be extracted
	 */
	@Override
	public Boolean hasNextPage() {
		Integer realReturnNum = extractReturnRecordNum();
		Integer currPageNo = extractCurrentPageNo();
		if (realReturnNum == null || currPageNo == null) {
			// Unknown — callers treat null optimistically.
			return null;
		}
		int totalPages = Page.getAnyTotalPageCount(realReturnNum,
				listPageConfig.getPageSize());
		// Return the comparison directly instead of if/true/else/false.
		return currPageNo < totalPages;
	}

	/**
	 * Builds the next page's search URL by incrementing the page-number
	 * parameter of the current spell URL.
	 * 
	 * @return the next-page URL, or {@code null} when the current page number
	 *         cannot be extracted
	 */
	@Override
	public String makeNextSpellUrl() {
		Integer currentPageNo = extractCurrentPageNo();
		if (currentPageNo == null) {
			// BUG FIX: "extractCurrentPageNo() + 1" used to unbox a possibly
			// null Integer and throw a NullPointerException (reachable when
			// hasNextPage() is null). Returning null makes the caller skip
			// saving a next-page URL instead of failing the whole query.
			logger.warn("makeNextSpellUrl: current page no is unknown, no next url generated.");
			return null;
		}
		String name = listPageConfig.getCurrentPageNoName();
		String newValue = String.valueOf(currentPageNo + 1);
		String nextSpellUrl = replaceParameterValue(queryURL.getSpellUrl(),
				name, newValue);
		logger.debug("makeNextSpellUrl is:[" + nextSpellUrl + "]");
		return nextSpellUrl;
	}

	/**
	 * Builds the search URLs for every page from the second up to and
	 * including the last page.
	 * 
	 * @param totalPageNum total number of list pages
	 * @return URLs for pages 2..totalPageNum (empty when totalPageNum &lt; 2)
	 */
	@Override
	public List<String> makeSecondStartNextSpellUrl(int totalPageNum) {
		String pageNoName = listPageConfig.getCurrentPageNoName();
		String baseSpellUrl = queryURL.getSpellUrl();
		List<String> spellUrls = new ArrayList<String>();
		for (int pageNo = SECOND_PAGE_NO; pageNo <= totalPageNum; pageNo++) {
			spellUrls.add(replaceParameterValue(baseSpellUrl, pageNoName,
					String.valueOf(pageNo)));
		}
		return spellUrls;
	}

	/**
	 * Creates a QueryURL object for the current engine from a spelled URL
	 * (used as both spell and post URL).
	 * 
	 * @param spellUrl the URL to wrap
	 * @return the new QueryURL
	 */
	@Override
	public QueryURL createQueryURL(String spellUrl) {
		return new QueryURL(queryURL.getSearchEngineId(), spellUrl, spellUrl);
	}

	/**
	 * Converts a possibly relative URL into an absolute one, resolved against
	 * the current post URL.
	 * 
	 * @param url absolute or relative URL
	 * @return an absolute URL; the input unchanged when it is already
	 *         absolute or cannot be resolved
	 */
	protected String uniformURLFormat(String url) {
		String lowered = StringUtils.lowerCase(url);
		boolean alreadyAbsolute = StringUtils.startsWith(lowered, "http://")
				|| StringUtils.startsWith(lowered, "https://");
		if (alreadyAbsolute) {
			return url;
		}
		try {
			URI resolved = URIUtils.resolve(new URI(queryURL.getPostUrl()), url);
			return resolved.toString();
		} catch (URISyntaxException e) {
			logger.error("", e);
			return url;
		}
	}

	/**
	 * Reads the value of a named query-string parameter from a URL.
	 * 
	 * @param url       URL whose query string is inspected
	 * @param name      parameter name to look up
	 * @param urlEncode charset used to decode the query string
	 * @return the first matching value, or "" when absent, null-valued, or
	 *         the URL is malformed
	 */
	protected String getParameterValueFromUrl(String url, String name,
			String urlEncode) {
		try {
			List<NameValuePair> pairs = URLEncodedUtils.parse(new URI(url),
					urlEncode);
			if (CollectionUtils.isNotEmpty(pairs)) {
				for (NameValuePair pair : pairs) {
					if (StringUtils.equals(pair.getName(), name)) {
						String value = pair.getValue();
						return value != null ? value : "";
					}
				}
			}
		} catch (URISyntaxException e) {
			logger.error("URISyntaxException", e);
		}
		return "";
	}

	/**
	 * Rewrites the value of one query-string parameter inside a URL.
	 * 
	 * @param spellUrl URL containing the parameter
	 * @param name     parameter name
	 * @param newValue replacement value
	 * @return the URL with the first "name=..." occurrence replaced
	 */
	protected String replaceParameterValue(String spellUrl, String name,
			String newValue) {
		logger.debug("replaceParameterValue before:[" + spellUrl + "]");
		String pattern = name + "=[^&]*";
		String replacement = name + "=" + newValue;
		String rewritten = Regex.replace(spellUrl, pattern, replacement, 1);
		logger.debug("replaceParameterValue after:[" + rewritten + "]");
		return rewritten;
	}

	/**
	 * Parses a list-page date string against the comma-separated date
	 * patterns configured for this engine. A parsed year of 1970 (pattern
	 * without a year part) is replaced with the current year.
	 * 
	 * @param dateStr raw date text from the list page
	 * @return the parsed date, or null when no pattern is configured or
	 *         parsing fails
	 */
	protected Date parseJobDate(String dateStr) {
		String datePatterns = listPageConfig.getJobDatePattern();
		if (StringUtils.isBlank(datePatterns)) {
			return null;
		}
		String[] datePatternsArr = StringUtils.split(datePatterns, ',');
		try {
			Date parsed = DateUtils.parseDate(dateStr, datePatternsArr);
			Calendar calendar = Calendar.getInstance();
			int currYear = calendar.get(Calendar.YEAR);
			calendar.setTime(parsed);
			if (calendar.get(Calendar.YEAR) == 1970) {
				calendar.set(Calendar.YEAR, currYear);
				parsed = calendar.getTime();
			}
			return parsed;
		} catch (ParseException e) {
			logger.error("dateStr:[" + dateStr + "],datePatternsArr:["
					+ ArrayUtils.toString(datePatternsArr) + "]", e);
			return null;
		}
	}

	/**
	 * Enables/disables forcing calculated paging (suppresses iteration-style
	 * next-page generation in processResultPage).
	 * 
	 * @param status true to force calculated paging
	 */
	@Override
	public void setForceUseCalculateTurnPage(boolean status) {
		forceUseCalculateTurnPage = status;
	}

	@Override
	public boolean getForceUseCalculateTurnPage() {
		return forceUseCalculateTurnPage;
	}

	/**
	 * Strips a trailing ";jsessionid=..." segment from a URL.
	 * 
	 * @param url URL possibly carrying a session id (may be null)
	 * @return everything before ";jsessionid=", or the input unchanged when
	 *         the marker is absent (null stays null)
	 */
	protected String substringSessionId(String url) {
		if (url == null) {
			return null;
		}
		int marker = url.indexOf(";jsessionid=");
		return marker < 0 ? url : url.substring(0, marker);
	}

	/**
	 * Assigns each record its unique MD5 fingerprint (for de-duplication).
	 * Records whose fingerprint cannot be computed are dropped from the list.
	 * 
	 * @param crawJobs records to fingerprint; modified in place
	 */
	private void setUniqueMd5(List<Record> crawJobs) {
		if (CollectionUtils.isEmpty(crawJobs)) {
			return;
		}
		for (Iterator<Record> it = crawJobs.iterator(); it.hasNext();) {
			Record job = it.next();
			try {
				job.setUniqueMd5(makeCrawlJobUniqueMd5(job));
			} catch (Exception e) {
				logger.error("queryUrlId:[" + queryURL.getId() + "]", e);
				it.remove();
			}
		}
	}

	/**
	 * Stamps each record with the engine's website id. A record that fails to
	 * be stamped is dropped from the list.
	 * 
	 * @param crawJobs records to stamp; modified in place
	 */
	private void setWebsiteId(List<Record> crawJobs) {
		if (CollectionUtils.isEmpty(crawJobs)) {
			return;
		}
		for (Iterator<Record> it = crawJobs.iterator(); it.hasNext();) {
			Record job = it.next();
			try {
				job.setWebsiteId(this.searchEngine.getWebsiteId());
			} catch (Exception e) {
				logger.error("queryUrlId:[" + queryURL.getId() + "]", e);
				it.remove();
			}
		}
	}

	/**
	 * Builds one record's unique MD5 fingerprint from
	 * websiteId + companyName + jobTitle + cityText (a blank city becomes
	 * "nohave"); used for de-duplication.
	 * 
	 * @param crawJob record to fingerprint
	 * @return MD5 hex digest of the concatenated key
	 * @throws RuntimeException when websiteId, companyName or jobTitle is
	 *                          missing
	 */
	private String makeCrawlJobUniqueMd5(Record crawJob) {
		int websiteId = searchEngine.getWebsiteId();
		String companyName = crawJob.getCompanyName();
		String jobTitle = crawJob.getJobTitle();
		String jobCityText = crawJob.getCityText();
		if (StringUtils.isBlank(jobCityText)) {
			jobCityText = "nohave";
		}
		// jobCityText can no longer be blank after the default above, so the
		// original dead isBlank(jobCityText) check was removed.
		if (websiteId == 0 || StringUtils.isBlank(companyName)
				|| StringUtils.isBlank(jobTitle)) {
			String errorMsg = "Make crawl job unique md5 error!!!   USE:websiteId,companyName,jobTitle,jobCityText MAKE MD5 >> crawJob:["
					+ crawJob + "].";
			throw new RuntimeException(errorMsg);
		}
		// StringBuilder: this buffer is local, no synchronization needed.
		StringBuilder crawJobMd5Key = new StringBuilder();
		crawJobMd5Key.append(websiteId);
		crawJobMd5Key.append(companyName);
		crawJobMd5Key.append(jobTitle);
		crawJobMd5Key.append(jobCityText);
		return MD5.Md5(crawJobMd5Key.toString(), "UTF-8");
	}

	/**
	 * Reads a named parameter from the current query: the spell URL is tried
	 * first, then the post URL.
	 * 
	 * @param parameterName parameter to look up
	 * @return the value, or "" when the name is blank or the parameter is
	 *         absent from both URLs
	 */
	protected String getParameterValue(String parameterName) {
		if (StringUtils.isBlank(parameterName)) {
			return "";
		}
		String value = getParameterValueFromUrl(queryURL.getSpellUrl(),
				parameterName, urlEncode);
		if (StringUtils.isBlank(value)) {
			value = getParameterValueFromUrl(queryURL.getPostUrl(),
					parameterName, urlEncode);
		}
		return value;
	}

}
